#census_api_key(Sys.getenv("CENSUS_API_KEY"), install = TRUE)
#install.packages("tidycensus")
# Load necessary libraries
library(tidycensus)
library(dplyr)
##
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
##
## filter, lag
## The following objects are masked from 'package:base':
##
## intersect, setdiff, setequal, union
library(tidyr)
#install.packages("factoextra")
library(factoextra)
## Loading required package: ggplot2
## Welcome! Want to learn more? See two factoextra-related books at https://goo.gl/ve3WBa
#install.packages("ggplot2")
#install.packages("usmap")
#install.packages("maps")
library(ggplot2)
library(usmap)
library(maps)
#Abhay
#Employment - K202301 for 2021
# Get ACS data
df1 <- get_acs(geography = "state",
table = "K202301",
year = 2021,
survey = "acs1-year",
cache_table = TRUE)
## Getting data from the ACS 1-year Supplemental Estimates. Data are available for geographies with populations of 20,000 and greater.
## Loading ACSSE variables for 2021 from table K202301 and caching the dataset for faster future access.
variables_df1 <- data.frame(
variable = c("K202301_001", "K202301_002", "K202301_003", "K202301_004", "K202301_005", "K202301_006", "K202301_007"),
label = c("Total", "In labor force:", "Civilian labor force:", "Employed",
"Unemployed", "In Armed Forces",
"Not in labor force")
)
# Join with variable labels
# Now df_labeled will have an additional column with the variable labels
# Load variables for ACS 2022
# Sample mapping of variable codes to labels
# Replace this with the actual codes and labels
# Assuming df1 is your data frame with the estimates you've loaded
# Make sure to replace 'NAME' and 'estimate' with the actual column names from your df1
# Join your data with the variable labels to get the descriptive names
df1_labeled <- df1 %>%
left_join(variables_df1, by = "variable")
# Reshape the data so that state names are rows and variable labels are columns
df1_wide <- df1_labeled %>%
pivot_wider(names_from = label, values_from = estimate, id_cols = NAME)
# df_wide now has states as columns and the descriptive variable labels as rows
# # Calculate employment rates
# df1_wide <- df1_wide %>%
# mutate(
# EmploymentRate = `In labor force: Civilian labor force: Employed` / `Total` * 100,
# UnemploymentRate = `In labor force: Civilian labor force: Unemployed` / `Total` * 100,
# NotInLaborForceRate = `Not in labor force` / `Total` * 100
# )
#
# # Plotting employment rates
# library(ggplot2)
#
# ggplot(df1_wide, aes(x = reorder(NAME, -EmploymentRate), y = EmploymentRate)) +
# geom_bar(stat = "identity", fill = "steelblue") +
# labs(x = "State", y = "Employment Rate (%)", title = "Employment Rates by State") +
# theme_minimal() +
# theme(axis.text.x = element_text(angle = 90, vjust = 0.5, hjust=1))
# # Plotting labor force participation
# ggplot(df1_wide, aes(x = NAME)) +
# geom_bar(aes(y = `In labor force: Civilian labor force: Employed`, fill = "Employed"), stat = "identity") +
# geom_bar(aes(y = `In labor force: Civilian labor force: Unemployed`, fill = "Unemployed"), stat = "identity") +
# geom_bar(aes(y = `In labor force: Armed Forces`, fill = "Armed Forces"), stat = "identity") +
# scale_fill_manual(values = c("Employed" = "green", "Unemployed" = "red", "Armed Forces" = "blue")) +
# theme(axis.text.x = element_text(angle = 90)) +
# labs(x = "State", y = "Number of People", fill = "Category", title = "Labor Force Participation by State")
# ggplot(df1_wide, aes(x = UnemploymentRate)) +
# geom_histogram(binwidth = 1, fill = "tomato", color = "black") +
# labs(x = "Unemployment Rate (%)", y = "Number of States", title = "Distribution of Unemployment Rates across States")
# # Correlation analysis
# employment_data <- df1_wide %>%
# select(`In labor force: Civilian labor force: Employed`, `In labor force: Civilian labor force: Unemployed`, `In labor force: Armed Forces`, `Not in labor force`)
#
# correlation_matrix <- cor(employment_data, use = "complete.obs")
#
# # Visualize the correlation matrix
# library(corrplot)
# corrplot(correlation_matrix, method = "circle")
# # Calculate employment rate
# # Assuming df1_wide or df6_wide has a column 'state' with state names or abbreviations
# # Calculate the metric you want to plot (for example, Employment Rate)
#
#
# # Assuming df1_wide has full state names in the 'NAME' column
# # Convert state names to abbreviations
# state_abbreviations <- state.abb[match(df1_wide$NAME, state.name)]
# df1_wide$state <- state_abbreviations
#
# # Ensure you have calculated the metric you want to plot (e.g., Employment Rate)
# df1_wide$EmploymentRate <- df1_wide$`In labor force: Civilian labor force: Employed` / df1_wide$Total * 100
#
# # Plot the map with employment rate
# plot_usmap(data = df1_wide, values = "EmploymentRate", labels = TRUE) +
# scale_fill_continuous(name = "Employment Rate (%)", label = scales::percent_format()) +
# theme(legend.position = "right") +
# labs(title = "Employment Rate by State in 2021")
#Abhay
#Disabilities - K201803
# Get ACS data for the Disabilities dataset
df6 <- get_acs(geography = "state",
table = "K201803",
year = 2021,
survey = "acs1-year",
cache_table = TRUE)
## Getting data from the ACS 1-year Supplemental Estimates. Data are available for geographies with populations of 20,000 and greater.
## Loading ACSSE variables for 2021 from table K201803 and caching the dataset for faster future access.
# Assuming the variables_df6 mapping is similar to variables_df1 but for the Disabilities table
# You need to create variables_df6 with the correct variable codes and labels for the Disabilities data
variables_df6 <- data.frame(
variable = c("K201803_001", "K201803_002", "K201803_003", "K201803_004", "K201803_005", "K201803_006", "K201803_007","K201803_008","K201803_009"),
label = c("Total people", "Total With Disabilities",
"Hearing", "Vision difficulty",
"cognative", "ambulatory difficulty",
"Self-care difficulty","Independent living difficulty","No Disability")
)
# Join your data with the variable labels to get the descriptive names
df6_labeled <- df6 %>%
left_join(variables_df6, by = "variable")
# Reshape the data so that state names are rows and variable labels are columns
df6_wide <- df6_labeled %>%
pivot_wider(names_from = label, values_from = estimate, id_cols = NAME)
df6_wide
## # A tibble: 52 × 10
## NAME `Total people` Total With Disabilit…¹ Hearing `Vision difficulty`
## <chr> <dbl> <dbl> <dbl> <dbl>
## 1 Alabama 4957633 808071 208028 152798
## 2 Alaska 702154 92390 33397 15748
## 3 Arizona 7174053 972252 298849 180792
## 4 Arkansas 2974701 517051 142133 105624
## 5 California 38724294 4324355 1140131 844049
## 6 Colorado 5715497 640346 211803 120570
## 7 Connecticut 3557526 427014 113490 78078
## 8 Delaware 987964 130551 37933 25335
## 9 District o… 659979 76754 14429 14569
## 10 Florida 21465883 2906367 812248 555361
## # ℹ 42 more rows
## # ℹ abbreviated name: ¹`Total With Disabilities`
## # ℹ 5 more variables: cognative <dbl>, `ambulatory difficulty` <dbl>,
## # `Self-care difficulty` <dbl>, `Independent living difficulty` <dbl>,
## # `No Disability` <dbl>
# df6_wide now has states as rows and the descriptive variable labels as columns
# Calculate percentages
df6_wide <- df6_wide %>%
mutate(DisabilityRate = `Total With Disabilities` / `Total people` * 100)
# Plotting with ggplot2
library(ggplot2)
ggplot(df6_wide, aes(x = reorder(NAME, -DisabilityRate), y = DisabilityRate)) +
geom_bar(stat = "identity", fill = "steelblue") +
labs(x = "State", y = "Disability Rate (%)", title = "Disability Rates by State") +
theme_minimal() +
theme(axis.text.x = element_text(angle = 90, vjust = 0.5, hjust=1))
# Select only the disability-related columns for correlation
disability_data <- df6_wide %>%
select(`Hearing`, `Vision difficulty`, `cognative`, `ambulatory difficulty`, `Self-care difficulty`, `Independent living difficulty`)
# Compute correlation matrix
correlation_matrix <- cor(disability_data, use = "complete.obs") # use = "complete.obs" handles missing values
# Check if corrplot is installed; if not, install it
if (!require(corrplot)) install.packages("corrplot")
## Loading required package: corrplot
## corrplot 0.92 loaded
library(corrplot)
corrplot(correlation_matrix, method = "circle")
# Comparative analysis of disabilities within a state (example: Iowa)
iowa_data <- df6_wide %>%
filter(NAME == "Iowa") %>%
select(`Hearing`, `Vision difficulty`, `cognative`, `ambulatory difficulty`, `Self-care difficulty`, `Independent living difficulty`)
# Plotting the data for Iowa
barplot(as.matrix(iowa_data), beside = TRUE, legend.text = TRUE, args.legend = list(x = "topright"))
# Predicting 'Ambulatory difficulty' based on other disabilities
lm_model <- lm(`ambulatory difficulty` ~ `Hearing` + `Vision difficulty` + `cognative` + `Self-care difficulty` + `Independent living difficulty`, data = df6_wide)
# Summary of the linear model
summary(lm_model)
##
## Call:
## lm(formula = `ambulatory difficulty` ~ Hearing + `Vision difficulty` +
## cognative + `Self-care difficulty` + `Independent living difficulty`,
## data = df6_wide)
##
## Residuals:
## Min 1Q Median 3Q Max
## -56621 -11941 -2766 17524 73868
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) -8200.5856 5646.4977 -1.452 0.153197
## Hearing 0.6796 0.1793 3.791 0.000436 ***
## `Vision difficulty` 1.0070 0.1644 6.126 1.87e-07 ***
## cognative -0.5426 0.2291 -2.369 0.022116 *
## `Self-care difficulty` -1.9464 0.4434 -4.390 6.59e-05 ***
## `Independent living difficulty` 1.9228 0.3144 6.115 1.95e-07 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 24940 on 46 degrees of freedom
## Multiple R-squared: 0.9968, Adjusted R-squared: 0.9965
## F-statistic: 2909 on 5 and 46 DF, p-value: < 2.2e-16
lm_model
##
## Call:
## lm(formula = `ambulatory difficulty` ~ Hearing + `Vision difficulty` +
## cognative + `Self-care difficulty` + `Independent living difficulty`,
## data = df6_wide)
##
## Coefficients:
## (Intercept) Hearing
## -8200.5856 0.6796
## `Vision difficulty` cognative
## 1.0070 -0.5426
## `Self-care difficulty` `Independent living difficulty`
## -1.9464 1.9228
# K-means clustering
#Perform a clustering analysis to identify groups of states with similar disability profiles.
# K-means clustering with 3 centers
set.seed(123) # For reproducibility
clusters <- kmeans(disability_data, centers = 3)
# Add cluster assignment to the data
df6_wide$Cluster <- as.factor(clusters$cluster)
pca_res <- prcomp(disability_data, scale. = TRUE)
fviz_pca_biplot(pca_res, label = "none", col.ind = df6_wide$Cluster, addEllipses = TRUE)
#Neha
#Education - K201501
df2 <- get_acs(geography = "state",
table = "K201501",
year = 2021,
survey = "acs1/subject",cache_table = TRUE)
## Getting data from the ACS 1-year Supplemental Estimates. Data are available for geographies with populations of 20,000 and greater.
## Loading ACSSE variables for 2021 from table K201501 and caching the dataset for faster future access.
variables_df2 <- data.frame(
variable = c("K201501_001", "K201501_002", "K201501_003", "K201501_004", "K201501_005", "K201501_006", "K201501_007", "K201501_008"),
label = c("Education_Total_students", "Education_Below_9th grade", "Education_9th to 12th grade_no diploma", "Education_High_school_graduate", "Education_Some college_no degree", "Education_Associate's_degree", "Education_Bachelor's_degree", "Education_Graduate_professional degree")
)
df2_labeled <- df2 %>%
left_join(variables_df2, by = "variable")
df2_wide <- df2_labeled %>%
pivot_wider(names_from = label, values_from = estimate, id_cols = NAME)
#Esha
#Citizenship - K200501
df3<- get_acs(
geography = "state",
table = "K200501",
year = 2021,
survey = "acs1/subject",
cache_table = TRUE
)
## Getting data from the ACS 1-year Supplemental Estimates. Data are available for geographies with populations of 20,000 and greater.
## Loading ACSSE variables for 2021 from table K200501 and caching the dataset for faster future access.
variables_df3 <- data.frame(
variable = c("K200501_001", "K200501_002","K200501_003"),
label = c("Total","U.S. citizen", "Not a U.S. citizen")
)
df3_labeled <- df3 %>%
left_join(variables_df3, by = "variable")
df3_wide <- df3_labeled %>%
pivot_wider(names_from = label, values_from = estimate, id_cols = NAME)
***
#Srika
#Age - K200104
df4 <- get_acs(geography = "state",
table = "K200104",
year = 2021,
survey = "acs1/subject",cache_table = TRUE
)
## Getting data from the ACS 1-year Supplemental Estimates. Data are available for geographies with populations of 20,000 and greater.
## Loading ACSSE variables for 2021 from table K200104 and caching the dataset for faster future access.
variables_df4 <- data.frame(
variable = c("K200104_001", "K200104_002", "K200104_003", "K200104_004", "K200104_005", "K200104_006", "K200104_007", "K200104_008"),
label = c("Total", "Age_under_18",
"Age_18_to_24", "Age_25_to_34",
"Age_35_to_44", "Age_45_to_54", "Age_55_to_64","Age_over_64")
)
df4_labeled <- df4 %>%
left_join(variables_df4, by = "variable")
df4_wide <- df4_labeled %>%
pivot_wider(names_from = label, values_from = estimate, id_cols = NAME)
library(tidyverse)
## ── Attaching core tidyverse packages ──────────────────────── tidyverse 2.0.0 ──
## ✔ forcats 1.0.0 ✔ readr 2.1.4
## ✔ lubridate 1.9.2 ✔ stringr 1.5.0
## ✔ purrr 1.0.2 ✔ tibble 3.2.1
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::lag() masks stats::lag()
## ✖ purrr::map() masks maps::map()
## ℹ Use the conflicted package (<http://conflicted.r-lib.org/>) to force all conflicts to become errors
library(tigris)
## To enable caching of data, set `options(tigris_use_cache = TRUE)`
## in your R script or .Rprofile.
Age_data <- get_acs(geography = "state",
table = "K200104",
year = 2021,
geometry = TRUE,
resolution = "20m",
survey = "acs1/subject",cache_table = TRUE
)
## Getting data from the ACS 1-year Supplemental Estimates. Data are available for geographies with populations of 20,000 and greater.
## Downloading feature geometry from the Census website. To cache shapefiles for use in future sessions, set `options(tigris_use_cache = TRUE)`.
## Loading ACSSE variables for 2021 from table K200104 and caching the dataset for faster future access.
##
|
| | 0%
|
|====== | 9%
|
|============ | 17%
|
|================== | 26%
|
|======================== | 35%
|
|=============================== | 44%
|
|===================================== | 52%
|
|=========================================== | 61%
|
|================================================= | 70%
|
|======================================================================| 100%
Mapping_age<-Age_data%>%
filter(variable=="K200104_007")%>%
shift_geometry()
ggplot(data = Mapping_age, aes(fill = estimate)) +
geom_sf()+
scale_fill_distiller(palette = "RdPu",
direction = 1) +
labs(title = "Median Age by State, 2021",
caption = "Data source: 2021 1-year ACS, US Census Bureau",
fill = "ACS estimate") +
theme_void()
#Niharika
# Housing - K202502
df5 <- get_acs(geography = "state",
table = "K202502",
year = 2021,
survey = "acs1", # removed /subject since we're referring to a specific table ID
cache_table = TRUE)
## Getting data from the 2021 1-year ACS
## The 1-year ACS provides data for geographies with populations of 65,000 and greater.
## Getting data from the ACS 1-year Supplemental Estimates. Data are available for geographies with populations of 20,000 and greater.
## Loading ACSSE variables for 2021 from table K202502 and caching the dataset for faster future access.
# Define the variable codes and their respective labels for the housing dataset (df5)
variables_df5 <- data.frame(
variable = c("K202502_001", "K202502_002", "K202502_003"),
label = c("Total", "Owner Occupied", "Renter Occupied")
)
# Assuming df5 is your housing dataset
# Reshape the data so that state names are rows
df5_labeled <- df5 %>%
left_join(variables_df5, by = "variable") # Joining with the variable labels dataframe
# Reshape the data to have state names as rows and variable labels as columns
df5_wide <- df5_labeled %>%
pivot_wider(names_from = label, values_from = estimate, id_cols = NAME)
# df5_wide will have states as rows and different housing-related variables as columns
merged_data <- df1_wide %>%
left_join(df2_wide, by = "NAME") %>%
left_join(df3_wide, by = "NAME")%>%
left_join(df4_wide, by = "NAME")%>%
left_join(df5_wide, by = "NAME")%>%
left_join(df6_wide, by = "NAME")
columns_to_drop <- c('Total.x', 'Total.y', 'Total.x.x', 'Total.y.y')
merged_data <- merged_data[, !(names(merged_data) %in% columns_to_drop)]
colnames(merged_data)
## [1] "NAME"
## [2] "In labor force:"
## [3] "Civilian labor force:"
## [4] "Employed"
## [5] "Unemployed"
## [6] "In Armed Forces"
## [7] "Not in labor force"
## [8] "Education_Total_students"
## [9] "Education_Below_9th grade"
## [10] "Education_9th to 12th grade_no diploma"
## [11] "Education_High_school_graduate"
## [12] "Education_Some college_no degree"
## [13] "Education_Associate's_degree"
## [14] "Education_Bachelor's_degree"
## [15] "Education_Graduate_professional degree"
## [16] "U.S. citizen"
## [17] "Not a U.S. citizen"
## [18] "Age_under_18"
## [19] "Age_18_to_24"
## [20] "Age_25_to_34"
## [21] "Age_35_to_44"
## [22] "Age_45_to_54"
## [23] "Age_55_to_64"
## [24] "Age_over_64"
## [25] "Owner Occupied"
## [26] "Renter Occupied"
## [27] "Total people"
## [28] "Total With Disabilities"
## [29] "Hearing"
## [30] "Vision difficulty"
## [31] "cognative"
## [32] "ambulatory difficulty"
## [33] "Self-care difficulty"
## [34] "Independent living difficulty"
## [35] "No Disability"
## [36] "DisabilityRate"
## [37] "Cluster"
“Socio-Economic Factors Influencing Employment in the United States: A Comprehensive State-by-State Analysis”
This project aims to analyze the impact of various socio-economic factors such as education, age, citizenship status, housing, and disabilities on employment rates across different states in the United States. Using data from the American Community Survey (ACS) 2021, the study will employ statistical techniques to identify correlations, trends, and patterns, thereby providing insights into the multifaceted nature of employment dynamics in the US.
Data Exploration and Cleaning
Continue with data cleaning and handling missing values.
Normalize the data if required for better comparison.
# Checking for missing values
sum(is.na(merged_data))
## [1] 2
# Handling missing values (example: replacing with median)
merged_data <- merged_data %>%
mutate_at(vars(-group_cols()), ~ifelse(is.na(.), median(., na.rm = TRUE), .))
# Normalizing the data (example: Min-Max Scaling)
normalize <- function(x) {
return ((x - min(x)) / (max(x) - min(x)))
}
merged_data_normalized <- as.data.frame(lapply(merged_data[, sapply(merged_data, is.numeric)], normalize))
Exploratory Data Analysis (EDA)
Generate summary statistics and visualizations for each dataset to understand distributions and detect outliers.
Explore the relationships between different variables using scatter plots, box plots, etc.
# Summary Statistics
summary(merged_data)
## NAME In labor force: Civilian labor force: Employed
## Length:52 Min. : 301303 Min. : 298603 Min. : 287432
## Class :character 1st Qu.: 897042 1st Qu.: 892926 1st Qu.: 845392
## Mode :character Median : 2134618 Median : 2113270 Median : 1975060
## Mean : 3259412 Mean : 3233640 Mean : 3028193
## 3rd Qu.: 3897332 3rd Qu.: 3877097 3rd Qu.: 3627645
## Max. :19961610 Max. :19805371 Max. :18156051
## Unemployed In Armed Forces Not in labor force Education_Total_students
## Min. : 11171 Min. : 1313 Min. : 160319 Min. : 395348
## 1st Qu.: 47280 1st Qu.: 4381 1st Qu.: 531030 1st Qu.: 1263471
## Median : 132957 Median : 12108 Median : 1463585 Median : 3054251
## Mean : 205447 Mean : 25771 Mean : 1930062 Mean : 4434517
## 3rd Qu.: 216516 3rd Qu.: 25365 3rd Qu.: 2270377 3rd Qu.: 5085510
## Max. :1649320 Max. :156239 Max. :11545627 Max. :26909869
## Education_Below_9th grade Education_9th to 12th grade_no diploma
## Min. : 7224 Min. : 17461
## 1st Qu.: 40727 1st Qu.: 65815
## Median : 108208 Median : 173250
## Mean : 214804 Mean : 261375
## 3rd Qu.: 212026 3rd Qu.: 294885
## Max. :2380657 Max. :1804222
## Education_High_school_graduate Education_Some college_no degree
## Min. : 70925 Min. : 56221
## 1st Qu.: 335012 1st Qu.: 263017
## Median : 813017 Median : 618456
## Mean :1166698 Mean : 852411
## 3rd Qu.:1441058 3rd Qu.:1014627
## Max. :5578997 Max. :5287901
## Education_Associate's_degree Education_Bachelor's_degree
## Min. : 15235 Min. : 73255
## 1st Qu.: 123095 1st Qu.: 243453
## Median : 273742 Median : 547876
## Mean : 389473 Mean : 941835
## 3rd Qu.: 460085 3rd Qu.:1259927
## Max. :2120275 Max. :5958030
## Education_Graduate_professional degree U.S. citizen Not a U.S. citizen
## Min. : 42363 Min. : 567083 Min. : 11069
## 1st Qu.: 158678 1st Qu.: 1820268 1st Qu.: 66793
## Median : 358179 Median : 4404876 Median : 148529
## Mean : 607922 Mean : 6059088 Mean : 411049
## 3rd Qu.: 863754 3rd Qu.: 6893788 3rd Qu.: 423286
## Max. :3779787 Max. :34499330 Max. :4738506
## Age_under_18 Age_18_to_24 Age_25_to_34 Age_35_to_44
## Min. : 116767 Min. : 51926 Min. : 70511 Min. : 78268
## 1st Qu.: 442838 1st Qu.: 169060 1st Qu.: 235319 1st Qu.: 245703
## Median : 986696 Median : 402442 Median : 589334 Median : 584114
## Mean :1423482 Mean : 587334 Mean : 874641 Mean : 848734
## 3rd Qu.:1629658 3rd Qu.: 683308 3rd Qu.:1039599 3rd Qu.: 961656
## Max. :8769779 Max. :3558188 Max. :5827869 Max. :5414686
## Age_45_to_54 Age_55_to_64 Age_over_64 Owner Occupied
## Min. : 65531 Min. : 66238 Min. : 85615 Min. : 132936
## 1st Qu.: 216349 1st Qu.: 238707 1st Qu.: 320627 1st Qu.: 524154
## Median : 524686 Median : 559544 Median : 767373 Median :1143470
## Mean : 790186 Mean : 831869 Mean :1089087 Mean :1619184
## 3rd Qu.: 888609 3rd Qu.: 958270 3rd Qu.:1272226 3rd Qu.:1912862
## Max. :4928928 Max. :4773860 Max. :5964526 Max. :7502706
## Renter Occupied Total people Total With Disabilities Hearing
## Min. : 69516 Min. : 570046 Min. : 76754 Min. : 14429
## 1st Qu.: 192915 1st Qu.: 1848408 1st Qu.: 257303 1st Qu.: 84763
## Median : 569181 Median : 4317403 Median : 660485 Median : 185961
## Mean : 856022 Mean : 6349048 Mean : 830722 Mean : 226790
## 3rd Qu.:1032259 3rd Qu.: 7285134 3rd Qu.: 979094 3rd Qu.: 288780
## Max. :5926357 Max. :38724294 Max. :4324355 Max. :1140131
## Vision difficulty cognative ambulatory difficulty Self-care difficulty
## Min. : 12915 Min. : 27746 Min. : 33324 Min. : 10601
## 1st Qu.: 48799 1st Qu.: 104446 1st Qu.: 109640 1st Qu.: 43107
## Median :119128 Median : 261010 Median : 291072 Median :114600
## Mean :158917 Mean : 323599 Mean : 400503 Mean :154262
## 3rd Qu.:193366 3rd Qu.: 374062 3rd Qu.: 462491 3rd Qu.:171742
## Max. :844049 Max. :1710305 Max. :2094886 Max. :972688
## Independent living difficulty No Disability DisabilityRate
## Min. : 23311 Min. : 490741 Min. :10.35
## 1st Qu.: 77948 1st Qu.: 1570278 1st Qu.:12.05
## Median : 229318 Median : 3608154 Median :13.46
## Mean : 296658 Mean : 5518325 Mean :13.75
## 3rd Qu.: 346728 3rd Qu.: 6306040 3rd Qu.:14.38
## Max. :1741491 Max. :34399939 Max. :22.01
## Cluster
## Min. :1.000
## 1st Qu.:1.000
## Median :2.000
## Mean :1.712
## 3rd Qu.:2.000
## Max. :3.000
# Load necessary library
library(ggplot2)
library(rlang)
##
## Attaching package: 'rlang'
## The following objects are masked from 'package:purrr':
##
## %@%, flatten, flatten_chr, flatten_dbl, flatten_int, flatten_lgl,
## flatten_raw, invoke, splice
# Loop through each column in the dataframe
for (i in names(merged_data)) {
# Convert column name to a symbol
col_sym <- sym(i)
# Check if the column is numeric for histograms
if (is.numeric(merged_data[[i]])) {
p <- ggplot(merged_data, aes(x = !!col_sym)) +
geom_histogram(bins = 30, fill = "blue", color = "black") +
theme_minimal() +
labs(title = paste("Histogram of", i), x = i, y = "Frequency")
# If not numeric, use bar plot for categorical data
} else {
p <- ggplot(merged_data, aes(x = !!col_sym)) +
geom_bar(fill = "blue", color = "black") +
theme_minimal() +
labs(title = paste("Bar Plot of", i), x = i, y = "Count") +
theme(axis.text.x = element_text(angle = 90, hjust = 1)) # Rotate x labels for readability
}
# Print the plot
print(p)
}
Correlation Analysis
Statistical Testing
Predictive Modeling
Build regression models to predict employment rates based on other variables.
Explore other predictive models if applicable (e.g., decision trees, random forests).
Clustering Analysis
Data Visualization and Mapping
ggplot2 and
usmap for spatial data
representation.Interpretation and Conclusion
Interpret the results from the statistical analyses and models.
Draw conclusions about the socio-economic factors affecting employment.
Reporting